library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
  1. (10 points). Compute the \(3 × 3\) partial correlation matrix for Revenue and truck drivers example using two methods: by inverse correlation matrix and correlation of residuals. Make sure that the two matrices coincide. Interpret the result in layman terms.

##inverse correlation matrix

truckDrivers<-read_csv("/Users/parinithakompala/Desktop/QBS124/ass3/truckR.data.csv")
## Rows: 20 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): truc.dr, revenue
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
n=nrow(truckDrivers)
truckDrivers$time=1:n

head(truckDrivers)
mat<-cor(truckDrivers) # cor matrix
mat
##           truc.dr   revenue      time
## truc.dr 1.0000000 0.9423984 0.9539531
## revenue 0.9423984 1.0000000 0.9672527
## time    0.9539531 0.9672527 1.0000000
mat_inv<-solve(mat) #inverse
mat_inv
##           truc.dr    revenue       time
## truc.dr 11.910612  -3.639377  -7.841967
## revenue -3.639377  16.634643 -12.618108
## time    -7.841967 -12.618108  20.685768
#R=cor(LRT)
R=mat
iR=iiR=solve(R)
ns=3
for(i in 1:ns)
  for(j in 1:ns)
    iR[i,j]=-iiR[i,j]/sqrt(abs(iiR[i,i]*iiR[j,j]))
    diag(iR)=rep(NA,ns)
    image(1:ns,1:ns,iR,col=c("cyan","goldenrod1","yellow","pink2"),breaks=c(-.25,.5,.35,.5,.75),xlab="",ylab="",axes=F)
## Warning in image.default(1:ns, 1:ns, iR, col = c("cyan", "goldenrod1",
## "yellow", : unsorted 'breaks' will be sorted before use
    axis(side=1,at=1:ns,labels=names(truckDrivers),font=2)
    axis(side=2,at=1:ns,labels=names(truckDrivers),srt=90,font=2)
    axis(side=3,at=1:ns,labels=names(truckDrivers),font=2)
    axis(side=4,at=1:ns,labels=names(truckDrivers),font=2)
    for(i in 1:ns)
        text(rep(i,ns),1:ns,round(iR[i,],2),font=2)

##Correlation of residuals

##Alternatively, partial correlation coefficient can be computed as the correlation between residuals from regressions X on Z and Y on Z..

#x=TruckDrivers,Y=revenue,z=time
x=lm(truc.dr~time,data=truckDrivers)$residuals
x
##           1           2           3           4           5           6 
## -0.63289671  0.01947453  0.55375202 -0.87420453  0.45667373  0.27397840 
##           7           8           9          10          11          12 
##  0.31223574  1.32639177 -1.02611231  1.44307738 -0.58610939 -0.98958281 
##          13          14          15          16          17          18 
## -0.59175918  0.36021523  0.24257212 -0.23416646 -0.89656382 -0.60882574 
##          19          20 
##  1.24669424  0.20515578
y=lm(revenue~time,data=truckDrivers)$residuals
y
##          1          2          3          4          5          6          7 
## -1.3752059 -1.7803210  1.9879346 -6.9735547  0.1616912 -1.4847593  5.9439196 
##          8          9         10         11         12         13         14 
##  7.1234474 -2.1727327 -1.7477582  2.0889068  0.8319533  0.8767898  2.5612970 
##         15         16         17         18         19         20 
## -2.2831059  0.7394079  1.5393935 -3.1881438 -3.8018366  0.9526769
M=(cor(x,y))
#reff- https://towardsdatascience.com/keeping-an-eye-on-confounds-a-walk-through-for-calculating-a-partial-correlation-matrix-2ac6b831c5b6

partialCors <- list()

truckColNames <- colnames(truckDrivers)

for (i in 1:length(truckColNames)) {
  
  y <- truckColNames[i]
  
  covariatesAll <- truckColNames[!(truckColNames %in% y)]
  
  crntPcor <- double()
  
  for (j in 1:length(covariatesAll)) {
  
    covarLeftOut <- covariatesAll[j]
    
    covariatesCrnt <- covariatesAll[!(covariatesAll %in% covarLeftOut)]
    
    rhs <- paste(covariatesCrnt, collapse = " + ")
    
    lhs <- paste(y, "~")
    
    frmla <- as.formula(paste(lhs, rhs))
    
    mod1 <- lm(frmla, data = truckDrivers)
    
    R1 <- resid(mod1)
    
    lhs <- paste(covarLeftOut, "~")
    
    frmla <- as.formula(paste(lhs, rhs))
    
    mod2 <- lm(frmla, data = truckDrivers)
    
    R2 <- resid(mod2)
    
    crntPcor[j] <- cor(R1,R2)
  
  }  
    
  partialCors[[i]] <- append(crntPcor, 1 , i-1)
  
}

partialCorMat <- matrix(unlist(partialCors), 
                        ncol = length(truckColNames), 
                        nrow = length(truckColNames), 
                        dimnames = list(truckColNames, truckColNames ))

partialCorMat
##           truc.dr   revenue      time
## truc.dr 1.0000000 0.2585552 0.4995997
## revenue 0.2585552 1.0000000 0.6802236
## time    0.4995997 0.6802236 1.0000000
iR<-partialCorMat
ns=3
diag(iR)=rep(NA,ns)
    image(1:ns,1:ns,iR,col=c("cyan","goldenrod1","yellow","pink2"),breaks=c(-.25,.5,.35,.5,.75),xlab="",ylab="",axes=F)
## Warning in image.default(1:ns, 1:ns, iR, col = c("cyan", "goldenrod1",
## "yellow", : unsorted 'breaks' will be sorted before use
    axis(side=1,at=1:ns,labels=names(truckDrivers),font=2)
    axis(side=2,at=1:ns,labels=names(truckDrivers),srt=90,font=2)
    axis(side=3,at=1:ns,labels=names(truckDrivers),font=2)
    axis(side=4,at=1:ns,labels=names(truckDrivers),font=2)
    for(i in 1:ns)
        text(rep(i,ns),1:ns,round(iR[i,],2),font=2)

  1. (5 points). Explain in layman language false correlation referring to Example 1.

Answer- According to Example 1, we can see tha U is the comman factor that control the variation in X and Y ans also causes his correlation between X and Y .Z,Q,U are independent. The Variance of U is the strength of influenze of U over X and Y, here we can have var(U)=\(\sigma^2\), if \(\sigma^2\)=0 then X and Y will be independent, cor=0.So we can say cor(X,Y) id the fn of \(\sigma^2\).If \(\sigma^2=\infty\), cor=1, everything will be dominated by U.As we know covariance is linear fn. Correlation between X and Y is positive.We should exclude U to refine the correlation between X and Y.For this we need to consider “partial correlation”.As said,“Every time you see an expected high correlation it may be caused by the presence of a common factor that influences both variables”.In this case it is U. We need to regress x on other variables and regress Y on other variables, by this we exclude the influses by other variables and just can kind correlation between X and Y.So we can do this or do inverse correlation matrix to exclude the commen variable and get the corr between X and Y. In other words lets say The strength of a relationship between two variables is measured using partial correlation, which accounts for the effect of one or more other factors. For example, while controlling for weight and exercise (U), you would wish to investigate if there is a link between the amount of food consumed(X) and blood pressure (Y). Multiple variables can be controlled at the same time (called control variables or covariates). More than one or two control variables, on the other hand, are usually not suggested because the more control variables you have, the less reliable your test will be. There is one continuous independent variable (the x value) and one continuous dependent variable (the y value) in partial correlation.This is the same as when performing a standard correlation analysis. The dependent variable in the blood pressure example is “amount of food eaten,” while the independent variable is “blood pressure.” Weight and amount of exercise should also be continuous control variables.

  1. (10 points).
skelet<-read.csv("/Users/parinithakompala/Desktop/QBS124/ass4/Goldman.csv")

head(skelet)
dim(skelet)
## [1] 1538   69
  1. Remove the columns that have \(-1\) or 1 correlation with others as follows from skelet_2.pdf.
d=read.csv("/Users/parinithakompala/Desktop/QBS124/ass4/Goldman.csv",stringsAsFactors=F)
female=as.numeric(d[,3])
## Warning: NAs introduced by coercion
d=d[,18:ncol(d)]
#d = select(d, -McH.FHD, -AVG.FHD) #removing the columns
d <- subset (d, select = -McH.FHD)
d <- subset (d, select = -AVG.FHD)
nm=names(d)
nc=ncol(d);nr=nrow(d)   
for(i in 1:nc)
if(sum(is.na(d[,i]))==nr) {alln=i;break}
d=d[,-alln]
nm=nm[-alln]
nc=ncol(d)
R=cor(d,use="pairwise.complete.obs")
n=nrow(R)
par(mfrow=c(1,1),mar=c(0,1,2,1))
        cl=c("deepskyblue","lightblue","green","yellow","red")
        image(1:n,1:n,breaks=c(-.75,-.5,0,.5,.75,1),ylim=c(-5,n+1),xlim=c(-2,n+.5),col=cl,ylab="",xlab="",R,axes=F) 
        text(1:n,rep(0.5,n),nm,adj=1,cex=.75,srt=45)
        text(rep(.3,n),1:n,nm,adj=1,cex=.75)
        for(i in 1:n)
        for(j in 1:n)
        text(i,j,round(R[i,j],2),cex=.5)
        mtext(side=3,paste("Correlation heatmap of",n,"osteometric measurements taken from 1538 human skeletons"),cex=2) 
        br=c("-0.75 to -0.5","-0.5 to 0","0 to 0.5","0.5 to 0.75","0.75 to 1.0")
        legend(11,-2,br,col=cl,pch=15,horiz=T,cex=1.25)

  1. Apply option use=“complete.obs” when computing the regular correlation matrix and then regularize it by adding \(10^{−20}\) to the diagonal elements.
d=as.matrix(d)
M=cor(d, method = "pearson", use = "complete.obs")
corrplot(M, method = 'number')

  1. Compute and display the partial correlation matrix. Use your own breaks and colors (see Rcolor.pdf) to cover the range of correlation coefficients from \(-1\) to 1.
par(mfrow=c(1,1),mar=c(0,1,2,1))
        cl=c("cadetblue1","plum2","darkolivegreen1","darkslategray1","deeppink")
        image(1:n,1:n,breaks=c(-.75,-.5,0,.5,.75,1),ylim=c(-5,n+1),xlim=c(-2,n+.5),col=cl,ylab="",xlab="",R,axes=F) 
        text(1:n,rep(0.5,n),nm,adj=1,cex=.75,srt=45)
        text(rep(.3,n),1:n,nm,adj=1,cex=.75)
        for(i in 1:n)
        for(j in 1:n)
        text(i,j,round(R[i,j],2),cex=.5)
        mtext(side=3,paste("Correlation heatmap of",n,"osteometric measurements taken from 1538 human skeletons"),cex=2) 
        br=c("-0.75 to -0.5","-0.5 to 0","0 to 0.5","0.5 to 0.75","0.75 to 1.0")
        legend(11,-2,br,col=cl,pch=15,horiz=T,cex=1.25)

  1. Display the partial correlation matrix using pheatmap package.
# load package
library(pheatmap)
# d=read.csv("/Users/parinithakompala/Desktop/QBS124/ass4/Goldman.csv",stringsAsFactors=F)
# female=as.numeric(d[,3])
# d=d[,18:ncol(d)]
# #d = select(d, -McH.FHD, -AVG.FHD) #removing the columns
# d <- subset (d, select = -McH.FHD)
# d <- subset (d, select = -AVG.FHD)
# nm=names(d)
# nc=ncol(d);nr=nrow(d)
# for(i in 1:nc)
#   if(sum(is.na(d[,i]))==nr) {alln=i;break}
# d=d[,-alln]
# nm=nm[-alln]
# nc=ncol(d)
# R=cor(d,use="pairwise.complete.obs")
# n=nrow(R)
# 
# R=as.data.frame(R)
# names(R)=names(d)
# pheatmap(R)
# readline()
# R=solve(R)
# for(i in 1:ncol(R))
#   for(j in 1:ncol(R))
#     if(i>j)
#       R[i,j]=R[j,i]=-R[i,j]/sqrt(R[i,i]*R[j,j])
# diag(R)=1
# pheatmap(R)
  1. Make your interpretation and conclusion. Save the last heatmap in large size png format file.